Homework 7

  1. Doing Challenge 4 from Data and Graphics Challenge
InĀ [21]:
import altair as alt
import pandas as pd
InĀ [22]:
countryCodes = pd.read_csv("https://raw.githubusercontent.com/lukes/ISO-3166-Countries-with-Regional-Codes/master/all/all.csv")
countryCodes.head()
Out[22]:
name alpha-2 alpha-3 country-code iso_3166-2 region sub-region intermediate-region region-code sub-region-code intermediate-region-code
0 Afghanistan AF AFG 4 ISO 3166-2:AF Asia Southern Asia NaN 142.0 34.0 NaN
1 ƅland Islands AX ALA 248 ISO 3166-2:AX Europe Northern Europe NaN 150.0 154.0 NaN
2 Albania AL ALB 8 ISO 3166-2:AL Europe Southern Europe NaN 150.0 39.0 NaN
3 Algeria DZ DZA 12 ISO 3166-2:DZ Africa Northern Africa NaN 2.0 15.0 NaN
4 American Samoa AS ASM 16 ISO 3166-2:AS Oceania Polynesia NaN 9.0 61.0 NaN
InĀ [31]:
gasolinePrices = pd.read_csv("./pump_price_for_gasoline_us_per_liter.csv")
gasolinePrices.head()
Out[31]:
country 1991 1992 1993 1994 1995 1996 1997 1998 1999 ... 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016
0 Afghanistan NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN 1.05 NaN 1.15 NaN 1.28 NaN 1.07 NaN 0.7
1 Angola NaN NaN NaN NaN NaN NaN NaN 0.38 NaN ... NaN 0.53 NaN 0.65 NaN 0.63 NaN 0.76 NaN 0.97
2 Albania NaN NaN NaN NaN NaN NaN NaN 0.86 NaN ... NaN 1.36 NaN 1.46 NaN 1.81 NaN 1.76 NaN 1.36
3 Andorra NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN 1.24 NaN 1.49 NaN 1.67 NaN 1.51 NaN NaN
4 UAE NaN NaN NaN NaN NaN NaN NaN 0.23 NaN ... NaN 0.45 NaN 0.47 NaN 0.47 NaN 0.47 NaN 0.49

5 rows Ɨ 27 columns

InĀ [32]:
gasPricesLong = pd.melt(gasolinePrices, id_vars=['country'], 
                        var_name='Year', value_name='GasPrice')

gasPricesLong.head()
Out[32]:
country Year GasPrice
0 Afghanistan 1991 NaN
1 Angola 1991 NaN
2 Albania 1991 NaN
3 Andorra 1991 NaN
4 UAE 1991 NaN
InĀ [29]:
country_dropdown = alt.binding_select(options=[None] + sorted(gasPricesLong['country'].unique()), name='country ')
country_selection = alt.selection_point(fields=['country'], bind=country_dropdown, name="Select", empty=True)

explorationChart = alt.Chart(gasPricesLong).mark_point().encode(
    x='Year:O',
    y='GasPrice:Q',
    color='country:N',
    tooltip=['country', 'Year', 'GasPrice']
).transform_filter(
    country_selection
).properties(
    width=600,
    height=400
).add_params(
    country_selection
)

explorationChart
Out[29]:
InĀ [36]:
gasPricesLongSub = pd.merge(gasPricesLong, countryCodes[['name', 'sub-region']], 
                         left_on='country', right_on='name', how='left')

gasPricesLongSub['GasPrice'] = pd.to_numeric(gasPricesLongSub['GasPrice'], errors='coerce')

gasPricesLongSub = gasPricesLongSub.groupby(['sub-region', 'Year'])['GasPrice'].mean().reset_index()

gasPricesLongSub['GasPrice'] = pd.to_numeric(gasPricesLongSub['GasPrice'], errors='coerce')

gasPricesLongSub.head()
Out[36]:
sub-region Year GasPrice
0 Australia and New Zealand 1991 NaN
1 Australia and New Zealand 1992 NaN
2 Australia and New Zealand 1993 NaN
3 Australia and New Zealand 1994 NaN
4 Australia and New Zealand 1995 0.61
InĀ [43]:
sub_dropdown = alt.binding_select(options=[None] + sorted(gasPricesLongSub['sub-region'].unique()), name='Sub Region')
sub_selection = alt.selection_point(fields=['sub-region'], bind=sub_dropdown, name="Select", empty=True)

chart = alt.Chart(gasPricesLongSub).mark_line().encode(
    x='Year:O',
    y='GasPrice:Q', 
    color=alt.Color('sub-region:N', legend=None),  
    tooltip=['sub-region:N', 'Year:O', 'GasPrice:Q']
).transform_filter(
    sub_selection
).properties(
    title='SubRegion Average Gas Price By Year',
    width=500,
    height=500
).transform_filter(
    alt.datum.GasPrice != None
).add_params(
    sub_selection
)

chart
Out[43]:

World Values Survey

  1. How Many Respondents In Each Country
InĀ [44]:
worldValues = pd.read_csv("https://calvin-data304.netlify.app/data/wvs.csv")

worldValues.head()
Out[44]:
sex birth_year birth_country_iso age age6 age3 married married_before country COW_NUM COW_ALPHA democracy_importance wave_chronology ISO_country S004 respondent_number_orig respondent_number_unified weight weight_equilibrated survey_year
0 2 1975.0 9999 43 3 2 1 -4 AUS 900 AUL 9 7 36 -4 36071236 36720001 1.010623 0.551572 2018
1 1 1957.0 36 60 5 3 1 -4 AUS 900 AUL 10 7 36 -4 36070000 36720002 0.651305 0.551572 2018
2 1 1977.0 9999 41 3 2 1 -4 AUS 900 AUL 6 7 36 -4 36070001 36720003 1.116451 0.551572 2018
3 2 1974.0 9999 43 3 2 1 -4 AUS 900 AUL 9 7 36 -4 36070002 36720004 0.591649 0.551572 2018
4 2 1970.0 9999 48 4 2 1 -4 AUS 900 AUL 10 7 36 -4 36070003 36720005 1.589662 0.551572 2018
InĀ [51]:
respondentsCountry = worldValues.groupby('country').size().reset_index(name = "RespondentCount")

respondentsCountry = pd.merge(respondentsCountry, countryCodes[['alpha-3', 'name']], 
                         left_on='country', right_on='alpha-3', how='left')

respondentsCountry.head()
Out[51]:
country RespondentCount alpha-3 name
0 AUS 1773 AUS Australia
1 CAN 4018 CAN Canada
2 DEU 1520 DEU Germany
3 GBR 2399 GBR United Kingdom of Great Britain and Northern I...
4 KOR 1245 KOR Korea, Republic of
InĀ [63]:
respondentGraph = alt.Chart(respondentsCountry).mark_bar().encode(
    x=alt.X('country:N', sort='-y', title=""),
    y=alt.Y('RespondentCount:Q', title="Number of Respondents"),
    color = alt.value('lightblue')
).properties(
    title=alt.Title(text="Respondents by Country", align='right')
)

respondentGraph
Out[63]:

Canada had a greater number of respondents then the rest of the countries and then the US and Great Britain stood out but besides that the rest were relatively similar.

InĀ [74]:
worldValuesAge = worldValues[['country', 'age']]

alt.data_transformers.disable_max_rows()

box_plot = alt.Chart(worldValuesAge).mark_boxplot().encode(
    x=alt.X('age:Q', title = "Age"),
    y=alt.Y('country:N', title=""),
    color=alt.value('lightblue')
).properties(
    title=alt.TitleParams(
        text='Age Distribution by Country',
        align='right'
    )
)

box_plot
Out[74]:
InĀ [78]:
worldValuesAge3 = worldValues[['country', 'age', 'age3']]

worldValuesAge3 = worldValuesAge3.groupby(['country', 'age3'])['age'].agg(['min', 'max']).reset_index()

worldValuesAge3.head()
Out[78]:
country age3 min max
0 AUS 1 17 29
1 AUS 2 30 49
2 AUS 3 50 98
3 CAN 1 18 29
4 CAN 2 30 49
InĀ [86]:
minMax3 = alt.Chart(worldValuesAge3).mark_bar().encode(
    x=alt.X('country:N', title='Country'),
    y=alt.Y('min:Q', title='Age'),
    y2='max:Q',
    color=alt.Color('age3:N', title="Grouping")
)
Out[86]:
InĀ [89]:
worldValuesAge6 = worldValues[['country', 'age', 'age6']]

worldValuesAge6 = worldValuesAge6.groupby(['country', 'age6'])['age'].agg(['min', 'max']).reset_index()

minMax6 = alt.Chart(worldValuesAge6).mark_bar().encode(
    x=alt.X('country:N', title='Country'),
    y=alt.Y('min:Q', title='Age'),
    y2='max:Q',
    color=alt.Color('age6:N', title="Grouping")
)

comparisonGraph = minMax3 | minMax6

comparisonGraph.properties(
    title = alt.TitleParams(
        text= "Age Groupings By Country"
    )
)
Out[89]:

From this chart we can see that the age groups are the same no matter what country.

InĀ [129]:
worldValuesErrorBand = worldValues[['country', 'age6', 'democracy_importance']]

worldValuesErrorBand['response_10'] = worldValuesErrorBand.loc[:, 'democracy_importance'].apply(lambda x: 1 if x == 10 else 0)

base = alt.Chart(worldValuesErrorBand).encode(
    x=alt.X("age6:O", sort='-x',title=""),
    y=alt.Y("response_10:Q", title="")
)

error_band = base.mark_errorband(extent="ci")

lineChart = base.mark_line().encode(
    y=alt.Y("mean(response_10):Q")
)

points = base.mark_circle(size=15).encode(
    y=alt.Y("mean(response_10):Q")
)

combined_chart = error_band + points + lineChart

combined_chart.facet(
    column = alt.Column("country:N", title="")).resolve_scale(y='independent').properties(
    title="Percentage of people who say it is essential to live in a democracy by age group"
)
/var/folders/jz/n5m4rwz17fb2c7rywz1r25_h0000gn/T/ipykernel_47653/3103417639.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  worldValuesErrorBand['response_10'] = worldValuesErrorBand.loc[:, 'democracy_importance'].apply(lambda x: 1 if x == 10 else 0)
Out[129]:
InĀ [99]:
worldValuesErrorBand = worldValues[['country', 'age6', 'democracy_importance']]

worldValuesErrorBandAgg = worldValuesErrorBand.groupby(['country', 'age6'])['democracy_importance'].mean().reset_index()

worldValuesErrorBand.head()
Out[99]:
country age6 democracy_importance
0 AUS 3 9
1 AUS 5 10
2 AUS 3 6
3 AUS 3 9
4 AUS 4 10
InĀ [118]:
base = alt.Chart(worldValuesErrorBand).encode(
    x=alt.X("age6:O", sort='-x',title=""),
    y=alt.Y("democracy_importance:Q", title="")
)

error_band = base.mark_errorband(extent="ci")

lineChart = base.mark_line().encode(
    y=alt.Y("mean(democracy_importance):Q")
)

points = base.mark_circle(size=15).encode(
    y=alt.Y("mean(democracy_importance):Q")
)

combined_chart = error_band + points + lineChart

combined_chart.facet(
    column = alt.Column("country:N", title="")).resolve_scale(y='independent').properties(
    title="Average Score(1-10) of people who say it is essential to live in a democracy by age group"
)
Out[118]:
InĀ [132]:
worldValuesErrorBand = worldValues[['age', 'country', 'democracy_importance']]

base = alt.Chart(worldValuesErrorBand).encode(
    x=alt.X("age:Q", sort='-x',title=""),
    y=alt.Y("democracy_importance:Q", title="")
)

error_band = base.mark_errorband(extent="ci")

lineChart = base.mark_line().encode(
    y=alt.Y("mean(democracy_importance):Q")
)

points = base.mark_circle(size=15).encode(
    y=alt.Y("mean(democracy_importance):Q")
)

combined_chart = error_band + points + lineChart

combined_chart.facet(
    column = alt.Column("country:N", title="")).resolve_scale(y='independent').properties(
    title="Average Score(1-10) of people who say it is essential to live in a democracy by age"
)
Out[132]:

This is worse too many points and overwhelming to read harder to see overall trends.

InĀ [153]:
worldValuesLOESS = worldValues[['age', 'country', 'democracy_importance']]

worldValuesLOESS = worldValuesLOESS.groupby(['age', 'country'])['democracy_importance'].mean().reset_index()

pointChart = alt.Chart(worldValuesLOESS).mark_point().encode(
    x='age:Q',
    y='democracy_importance:Q',
    color='country:N'
)

loessLine = alt.Chart(worldValuesLOESS).transform_loess(
    'age', 'democracy_importance', groupby=['country']
).mark_line(
    size=2
).encode(
    x='age:Q',
    y='democracy_importance:Q',
    color='country:N'
)

finalChart = pointChart + loessLine

finalChart.facet(column="country:N").properties(
    title= alt.TitleParams(
        text="LOESS Regression"
    )
)
Out[153]:
InĀ [154]:
worldValuesLOESS = worldValues[['age', 'country', 'democracy_importance']]

worldValuesLOESS = worldValuesLOESS.groupby(['age', 'country'])['democracy_importance'].mean().reset_index()

pointChart = alt.Chart(worldValuesLOESS).mark_point().encode(
    x='age:Q',
    y='democracy_importance:Q',
    color='country:N'
)

loessLine = alt.Chart(worldValuesLOESS).transform_regression(
    'age', 'democracy_importance', groupby=['country'], method='linear'
).mark_line(
    size=2
).encode(
    x='age:Q',
    y='democracy_importance:Q',
    color='country:N'
)

finalChart = pointChart + loessLine

finalChart.facet(column="country:N").properties(
    title= alt.TitleParams(
        text="Linear Regression"
    )
)
Out[154]:
InĀ [155]:
worldValuesLOESS = worldValues[['age', 'country', 'democracy_importance']]

worldValuesLOESS = worldValuesLOESS.groupby(['age', 'country'])['democracy_importance'].mean().reset_index()

pointChart = alt.Chart(worldValuesLOESS).mark_point().encode(
    x='age:Q',
    y='democracy_importance:Q',
    color='country:N'
)

loessLine = alt.Chart(worldValuesLOESS).transform_regression(
    'age', 'democracy_importance', groupby=['country'], method='poly'
).mark_line(
    size=2
).encode(
    x='age:Q',
    y='democracy_importance:Q',
    color='country:N'
)

finalChart = pointChart + loessLine

finalChart.facet(column="country:N").properties(
    title= alt.TitleParams(
        text="Polynomial Regression"
    )
)
Out[155]: